QSRR revisited. Hierarchical models

Author
Affiliation

Paweł Wiczling*

Department of Biopharmaceutics and Pharmacodynamics, Medical University of Gdańsk, Gen. J. Hallera 107, 80-416 Gdańsk, Poland

Published

April 26, 2025

Introductions

Setup

Code
library(ggplot2)
library(gridExtra)
library(patchwork)
library(tidyr)
library(dplyr)
library(GGally)
library(reshape2)
library(pracma)
library(here)
library(magick)
library(reticulate)
library(kableExtra)
require(visNetwork, quietly = TRUE)
library(igraph)
#remotes::install_github("metrumresearchgroup/mrgmisc")
library(tidyverse)
library(patchwork)

set.seed(10271998) ## not required but assures repeatable results

source("helper-functions.R")
Code
data_deliv_dir = here::here("data","deliv")
data_derived_dir = here::here("data","derived")
if(!file.exists(data_deliv_dir)) dir.create(data_deliv_dir, recursive = T)
if(!file.exists(data_derived_dir)) dir.create(data_derived_dir, recursive = T)

figures_eda_dir <- here::here("deliv","figures", "eda")
tables_eda_dir <- here::here("deliv","tables", "eda")
if(!file.exists(figures_eda_dir)) dir.create(figures_eda_dir, recursive = T)
if(!file.exists(tables_eda_dir)) dir.create(tables_eda_dir, recursive = T)

model_dir <- here::here("model","stan")  
figures_dir <- here::here("deliv","figures", "stan")
tables_dir <- here::here("deliv","tables", "stan")
if(!file.exists(figures_dir)) dir.create(figures_dir, recursive = T)
if(!file.exists(model_dir)) dir.create(model_dir, recursive = T)
if(!file.exists(tables_dir)) dir.create(tables_dir, recursive = T)

EDA

We used a publicly available dataset that comprises the measurements of RP-HPLC retention times collected for 1026 analytes. The retention times were measured under isocratic conditions on Eclipse Plus C18 (Agilent) stationary phase with 3.5 μm particles. The experiments were conducted using a mixture of two solvents: solvent A, which was made of 0.1% formic acid in water, and solvent B, which was made of 0.1% formic acid in acetonitrile. The column temperature was set at 35^{}C. The data were collected by Boswell et al. and were used to create a method to predict retention time by Back-Calculating the Gradient.

Load data

Code
data <- read.csv(here(data_deliv_dir, "database_logk_1026.csv"), header = TRUE)
analytes_names  <- read.csv(here(data_deliv_dir,"database_logk_1026_analyte_names.csv"), header = TRUE)
smiles <- read.csv(here(data_deliv_dir,"smiles1026.smi"), sep = "\t", header = FALSE)
functional_groups = read.csv(here(data_deliv_dir, 'checkmol_functional_groups.csv'))
functional_groups_names = read.csv(here(data_deliv_dir,'checkmol_functional_group_names.csv'))
data_ACD = read.csv(here(data_deliv_dir,'ACD_pKas.csv'))
data_pH <- read.csv(here(data_deliv_dir,"pH.csv"),header = TRUE)
results <-read.csv(here(data_deliv_dir,"results_table.csv"))

#similarity_matrix <- make_similarity_matrix_fun(lower_tri_df)
smiles<-smiles %>% rename(ID=V2,smiles=V1) %>% select(ID,smiles)
smiles$smiles[905] = "CN(C1CCCCC1N1CCCC1)C(=O)Cc1ccc(c(c1)Cl)Cl" # remove tartrate moiety 
smiles$smiles[425] = "CC(Cc1ccc(cc1)OCC(=O)O)NCC(c1cccc(c1)Cl)O" # remove Na+ and dissociation

analytes_names$Analyte <- iconv(analytes_names$Analyte, from = "", to = "UTF-8", sub = "byte")

Python

Code
use_condaenv("rdkit-env", conda = "C:/Users/GUMed/anaconda3/Scripts/conda.exe", required = TRUE)
Code
Chem <- import("rdkit.Chem")
AllChem <- import("rdkit.Chem.AllChem")
Draw <- import("rdkit.Chem.Draw")
rdFMCS <- import("rdkit.Chem.rdFMCS")
rdmolops <- import("rdkit.Chem.rdmolops")

# Create MCS parameters
mcs_params <- rdFMCS$MCSParameters()
mcs_params$AtomCompareParameters$MatchValences <- TRUE 
mcs_params$AtomCompareParameters$RingMatchesRingOnly <- TRUE  
mcs_params$AtomCompareParameters$Compare <- rdFMCS$AtomCompare$CompareElements
mcs_params$BondCompareParameters$CompleteRingsOnly <- TRUE 
mcs_params$BondCompareParameters$Compare <- rdFMCS$BondCompare$CompareOrder
mcs_params$MaximizeBonds <- TRUE
mcs_params$Timeout <- 20L           
#mcs_params$Verbose <- FALSE
mcs_params$Threshold <- 1 

Maximum Common Substructure

Code
smiles_vec <- smiles$smiles
mols <- lapply(smiles_vec, function(smi) Chem$MolFromSmiles(smi))

n <- length(mols)
pair_count <- choose(n, 2)

# Pre-allocate result list
results_list <- vector("list", pair_count)
pair_idx <- 1

# Faster nested loop with list output
for (i in 1:(n - 1)) {
  mol1 <- mols[[i]]
  for (j in (i + 1):n) {
    mol2 <- mols[[j]]
    
    # Skip if any molecule is NULL
    if (is.null(mol1) || is.null(mol2)) {
      mcs_size <- NA
    } else {
      mcs_result <- rdFMCS$FindMCS(list(mol1, mol2), parameters = mcs_params)
      mcs_size <- mcs_result$numAtoms
    }

    results_list[[pair_idx]] <- list(
      mol1_index = i,
      mol2_index = j,
      mcs_size = mcs_size
    )
    pair_idx <- pair_idx + 1
  }
}

# Convert to data.frame once
results <- do.call(rbind, lapply(results_list, as.data.frame))

write.csv(results, file = here(data_deliv_dir,"results_table.csv"), row.names = FALSE)

Subsets

Code
# Function to get number of atoms from SMILES
get_atom_count <- function(smiles) {
  mol <- Chem$MolFromSmiles(smiles)
  return(mol$GetNumAtoms())
}

atom_count <- unlist(lapply(smiles$smiles, get_atom_count))
results$mol1_count = atom_count[results$mol1_index]
results$mol2_count = atom_count[results$mol2_index]

results_selected <- results %>%
  mutate(crit = mol1_count + mol2_count - 2*mcs_size)%>%
  group_by(mol1_index) %>%
  slice(which.min(crit))

MCS Subsets

Code
# Extract the vectors
rows <- smiles$smiles[results_selected$mol1_index]
cols <- smiles$smiles[results_selected$mol2_index]

comparison_table <- purrr::map2_df(rows, cols, compare_smiles_pair,
                                   .progress = list(
  type = "iterator", 
  format = "Calculating {cli::pb_bar} {cli::pb_percent} {cli::pb_current}/{cli::pb_total}",
  clear = FALSE))

write.csv(comparison_table, file = here(data_deliv_dir,"comparison_table.csv"), row.names = FALSE)

Add membership groups

Code
results_selected_crit<-results_selected %>%
  filter(crit<12) 

uid = unique(c(results_selected_crit$mol1_index,results_selected_crit$mol2_index))
nodes <- data.frame(id = uid,label = paste(uid))
edges <- data.frame(from = results_selected_crit$mol1_index,
                    to = results_selected_crit$mol2_index, 
                    width =results_selected_crit$crit,
                    label = paste(results_selected_crit$crit))
g <- graph_from_data_frame(d = edges, vertices = nodes, directed = TRUE)
components <- components(g, mode = "weak")
nodes$temp <- components$membership

results_selected <- results_selected%>%
  mutate(id=mol1_index) %>%
  left_join(nodes) %>%
mutate(group12=temp) %>% select(-c(temp, label, id))

Visualise Network

Code
nodes$group = nodes$temp
visNetwork(nodes, edges, width = "100%")

Test

Code
# Extract the vectors
smiles1 <- smiles$smiles[378]
smiles2 <- smiles$smiles[613]

comparison_table <- purrr::map2_df(smiles1, smiles2, compare_smiles_pair) %>%
  select( "structure") %>%
  kbl(escape = FALSE, format = "html", align = "l") %>%
  kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover", "condensed"))

comparison_table

Plot

Code
plotting_i = function(i, .ilist) {
  data %>% 
    filter(ID %in% c(.ilist$id1[i], .ilist$id2[i])) %>%
    ggplot(aes(x = fi, y = logk, group = ID)) + 
    geom_line(aes(color = logP_ACD)) + 
    labs(
      title = paste(analytes_names$Analyte[.ilist$id1[i]], "\n", analytes_names$Analyte[.ilist$id2[i]]), 
      x = "\u03C6", 
      y = expression(log~k[obs])
    ) + 
    theme_gray(base_size = 10) + 
    theme(legend.position = "none")+
    theme(plot.title = element_text(size = 5), 
          axis.title = element_text(size = 7),
          axis.text = element_text(size = 5))
}

list0 = comparison_table%>% filter(similarity==0)
list1 = comparison_table%>% filter(similarity==1)
list2 = comparison_table%>% filter(similarity==2)
list10 = comparison_table%>% filter(similarity==10)
map(1:min(20, nrow(list0)), \(x) plotting_i(x, list0)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list1)), \(x) plotting_i(x, list1)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list2)), \(x) plotting_i(x, list2)) %>% wrap_plots(ncol = 4)

Code
map(1:min(20, nrow(list10)), \(x) plotting_i(x, list10)) %>% wrap_plots(ncol = 4)

Session info

Code
sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
[3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
[5] LC_TIME=Polish_Poland.utf8    

time zone: Europe/Warsaw
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.3   forcats_1.0.0     stringr_1.5.1     purrr_1.0.2      
 [5] readr_2.1.5       tibble_3.2.1      tidyverse_2.0.0   igraph_2.1.4     
 [9] visNetwork_2.1.2  kableExtra_1.4.0  reticulate_1.42.0 magick_2.8.6     
[13] here_1.0.1        pracma_2.4.4      reshape2_1.4.4    GGally_2.2.1     
[17] dplyr_1.1.4       tidyr_1.3.1       patchwork_1.2.0   gridExtra_2.3    
[21] ggplot2_3.5.1    

loaded via a namespace (and not attached):
 [1] gtable_0.3.5       xfun_0.44          htmlwidgets_1.6.4  lattice_0.21-8    
 [5] tzdb_0.4.0         vctrs_0.6.5        tools_4.3.1        generics_0.1.3    
 [9] fansi_1.0.6        pkgconfig_2.0.3    Matrix_1.6-5       RColorBrewer_1.1-3
[13] lifecycle_1.0.4    farver_2.1.2       compiler_4.3.1     textshaping_0.4.0 
[17] munsell_0.5.1      htmltools_0.5.8.1  yaml_2.3.8         pillar_1.9.0      
[21] ggstats_0.8.0      tidyselect_1.2.1   digest_0.6.35      stringi_1.8.4     
[25] labeling_0.4.3     rprojroot_2.0.4    fastmap_1.2.0      grid_4.3.1        
[29] colorspace_2.1-0   cli_3.6.2          magrittr_2.0.3     utf8_1.2.4        
[33] withr_3.0.2        scales_1.3.0       timechange_0.3.0   rmarkdown_2.27    
[37] ragg_1.3.2         png_0.1-8          hms_1.1.3          evaluate_1.0.3    
[41] knitr_1.46         viridisLite_0.4.2  rlang_1.1.3        Rcpp_1.0.12       
[45] glue_1.7.0         xml2_1.3.6         svglite_2.1.3      rstudioapi_0.16.0 
[49] jsonlite_1.8.8     R6_2.5.1           plyr_1.8.9         systemfonts_1.1.0